In [ ]:
##################################################################################################
##   Notebook used for extracting text from html files. Some basic preprocessing tasks 
##   v1.0 Reading text using BeautifulSoup   
##   Required Packages: os, BeautifulSoup
##   The html files are not included in the repository
##   They can be downloaded from the following link
##   https://www.rbi.org.in/scripts/SearchResults.aspx?search=rajan&sp=speeches
##################################################################################################

In [ ]:
import os 
from bs4 import BeautifulSoup as bs

In [ ]:
## Collect every html file found directly inside the speeches directory.
## os.listdir() returns names in arbitrary order, so the list is sorted to make
## "the first file" (htmlFiles[0]) the same on every run and on every machine.
##

rootDir = 'E:\\NLP Session\\RBIGovernorSpeeches\\'

htmlFiles = sorted(f for f in os.listdir(rootDir) if f.endswith('.html'))
htmlFiles

In [ ]:
## Selecting the first html file in the set.
## os.path.join builds the path whether or not rootDir ends with a separator,
## and print() is called as a function so the cell runs on Python 2 and Python 3.
##

fileName = os.path.join(rootDir, htmlFiles[0])
print(fileName)

In [ ]:
## Opening the file and converting it to a 'soup' object.
## The with-block closes the file handle as soon as parsing has finished
## instead of leaving it open for the rest of the kernel session.

with open(fileName) as htmlFile:
    soup = bs(htmlFile, 'html.parser')
soup
########################################################################################################
##   BeautifulSoup tutorial
##   A short tutorial that provides an intro to the package
########################################################################################################

In [ ]:
## Render the parsed document as indented, human-readable markup and print it
##
prettyHtml = soup.prettify()
print(prettyHtml)

In [ ]:
## Displays the <title> tag of the page
##
soup.find('title')

In [ ]:
## Displays only the string held by the <title> tag of the page
##
soup.find('title').string

In [ ]:
## Displays the first <p> tag found in the page
##
soup.find('p')

In [ ]:
## Displays the .string of the first <p> tag found in the page
##
soup.find('p').string

In [ ]:
## Gather the href attribute of every <a> tag, then print them one per line
##
hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
for href in hrefs:
    print(href)

In [ ]:
## Pull the text content out of the soup object and print it
##
rawText = soup.get_text()
print(rawText)

In [ ]:
# Re-parse the file so the clean-up below starts from a freshly built tree.
# The with-block closes the file handle once parsing has finished.
with open(fileName) as htmlFile:
    soup = bs(htmlFile, 'html.parser') # Parses text so that html tags can be extracted

# Removes the styling and other non-content elements from the tree
for tag in soup(["script", "style", "title", '[document]', 'head']):
    tag.extract()

# Extracts the text from the soup, dropping every non-ASCII character.
# Decoding the encoded bytes again keeps `cleaned` a plain string on Python 3,
# where str() applied directly to bytes would give the literal "b'...'" form.
cleaned = str(soup.get_text(separator=' ').encode('ascii', 'ignore').decode('ascii'))

# Strips out the leading and trailing whitespace
cleanedtext = cleaned.strip()
cleanedtext
The cleaned output obtained above is far better than the text we got by simply running the get_text() command on the raw soup object.